Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
'Data Science', 'Data Analysis','Data Analytics', 'Market Research Analyst'
'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)
eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
| match('SKILLS_NAME') \
| match('SPECIALIZED_SKILLS_NAME')
eda['DATA_ANALYST_JOB'].value_counts()DATA_ANALYST_JOB
False 38212
True 33042
Name: count, dtype: int64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
df_grouped = (
eda
.groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
.size()
.reset_index(name='Job_Count')
)
short_names = {
'Professional, Scientific, and Technical Services': 'Tech. Services',
'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
'Health Care and Social Assistance': 'Healthcare',
'Finance and Insurance': 'Finance',
'Information': 'Info Tech',
'Educational Services': 'Education',
'Manufacturing': 'Manufacturing',
'Retail Trade': 'Retail',
'Accommodation and Food Services': 'Hospitality',
'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})
pivot = (
df_grouped
.pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
.reset_index()
)
industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()
# 2) Build a 2-row subplot: bar on top, table below
fig = make_subplots(
rows=2, cols=1,
row_heights=[0.70, 0.30], # give a bit more room to the table
specs=[[{"type":"bar"}],[{"type":"table"}]],
vertical_spacing=0.12 # more space between bar and table
)
colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}
fig.add_trace(
go.Bar(
x=industries, y=y_true, name='True',
marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
text=y_true, textposition='outside'
),
row=1, col=1
)
fig.add_trace(
go.Bar(
x=industries, y=y_false, name='False',
marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
text=y_false, textposition='outside'
),
row=1, col=1
)
# 3) Slider steps: 0 → 8 000 in 200s
steps = []
for val in range(0, 8001, 200):
steps.append(dict(
label=str(val),
method="update",
args=[
{"y": [
[v if v>=val else 0 for v in y_true],
[v if v>=val else 0 for v in y_false]
]}
]
))
# 4) Final layout tweaks
fig.update_layout(
# lift slider above everything
sliders=[dict(
active=0,
currentvalue={"prefix":"Min Jobs: "},
pad={"b":0},
x=0.05,
y=1.05, # move slider way above the plot area
xanchor="left",
yanchor="bottom",
len=0.7,
font=dict(color='#A81D1D'),
steps=steps
)],
title=dict(
text="Data & Business Analytics Job Trends",
font=dict(size=24, color='#A81D1D'),
x=0.5,
y=0.95, # drop the title just below the slider
xanchor="center",
yanchor="top"
),
width=1100, height=850,
margin=dict(l=60, r=60, t=180, b=200), # extra top & bottom margin
plot_bgcolor='white',
paper_bgcolor='white',
xaxis=dict(
title="Industry",
title_font=dict(size=16, color='#A81D1D'),
tickmode='array',
tickvals=list(range(len(industries))),
ticktext=industries,
tickangle=-30,
tickfont=dict(size=11, color='#333'),
showline=True, linecolor='#A81D1D'
),
yaxis=dict(
title="Number of Jobs",
title_font=dict(size=16, color='#A81D1D'),
tickfont=dict(size=11, color='#333'),
gridcolor='rgba(200,200,200,0.3)',
showline=True, linecolor='#A81D1D',
range=[0, max(max(y_true),max(y_false))*1.2]
),
legend=dict(
title="Data Analyst Job",
title_font=dict(color='#A81D1D'),
font=dict(size=12),
x=0.95, y=0.95
),
bargap=0.2
)
fig.write_html(
"figures/edaplot1.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',') # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()
# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')
# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]
# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)
# Step 6: Create the bar plot
fig = px.bar(
df_skills_top,
x='Count',
y='SPECIALIZED_SKILLS_NAME',
color='Job_Category',
barmode='stack',
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
title='Top 10 Specialized Skills by Job Category'
)
# Step 7: Update layout for styling
fig.update_layout(
width=900,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.99,
yanchor='top'
),
xaxis=dict(
title='Number of Jobs',
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor='#E2E8F0',
linecolor='#2D3748',
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title='Specialized Skill',
title_font=dict(size=16),
tickfont=dict(size=12)
),
legend=dict(
title='Job Category',
font=dict(size=13),
bgcolor='#FFFFFF',
bordercolor='#FF6B6B',
borderwidth=1,
x=1.02,
y=0.5,
xanchor='left',
yanchor='middle'
)
)
# Save to HTML
fig.write_html(
'figures/edaplot2.html',
include_plotlyjs='cdn',
full_html=False
)Number of rows after cleaning: 71254
Top 10 specialized skills:
SPECIALIZED_SKILLS_NAME Job_Category Count
2071 "Business Intelligence" Analytics Job 8077
2072 "Business Intelligence" Non-Analytics Job 1778
2167 "Business Process" Analytics Job 4417
2168 "Business Process" Non-Analytics Job 8385
2180 "Business Requirements" Analytics Job 4972
2181 "Business Requirements" Non-Analytics Job 7740
4151 "Dashboard" Analytics Job 9975
4152 "Dashboard" Non-Analytics Job 1559
4173 "Data Analysis" Analytics Job 25620
6640 "Finance" Analytics Job 5573
6641 "Finance" Non-Analytics Job 6220
13332 "Project Management" Analytics Job 6236
13333 "Project Management" Non-Analytics Job 6939
13594 "Python (Programming Language)" Analytics Job 10127
13595 "Python (Programming Language)" Non-Analytics Job 1703
14459 "SAP Applications" Analytics Job 2174
14460 "SAP Applications" Non-Analytics Job 9149
14686 "SQL (Programming Language)" Analytics Job 16067
14687 "SQL (Programming Language)" Non-Analytics Job 4202
import plotly.express as px
import pandas as pd
# Prepare the data
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
if row['DATA_ANALYST_JOB']:
return True
title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
if 'business analyst' in title:
return True
return False
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
# Create the scatter plot with trend line
fig = px.scatter(df,
x='Avg_Years_Experience',
y='SALARY',
color='Job_Category',
trendline='ols', # Add trend line (ordinary least squares)
title='Experience Requirements vs Salary for Analytics Jobs',
labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})
# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
width=900,
height=600,
plot_bgcolor='#FFFFFF', # Plain white background
paper_bgcolor='#FFFFFF', # Plain white background
font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
title=dict(
font=dict(size=24, color="#FF6B6B"), # Red title for theme
x=0.5,
xanchor="center",
y=0.95,
yanchor="top"
),
xaxis=dict(
title="Average Years of Experience",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title="Salary ($)",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
legend=dict(
title="Job Category",
font=dict(size=13),
bgcolor="#FFFFFF",
bordercolor="#FF6B6B", # Red border for theme
borderwidth=1,
x=1.02,
y=0.5,
xanchor="left",
yanchor="middle"
),
hovermode="closest",
hoverlabel=dict(
bgcolor="#FFFFFF",
font_size=12,
font_family="Inter, sans-serif",
font_color="#2D3748",
bordercolor="#FF6B6B" # Red border for hover
)
)
# Customize scatter points
fig.update_traces(
marker=dict(
size=8,
opacity=0.7,
line=dict(width=1, color="#2D3748")
)
)
fig.write_html(
"figures/edaplot3.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Clean the data (remove rows with missing STATE_NAME)
df = df.dropna(subset=['STATE_NAME'])
# Debug: Check the number of rows after cleaning and unique states
print("Number of rows after cleaning:", len(df))
print("Unique states extracted:", df['STATE_NAME'].unique())
# Step 2: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 3: Aggregate data by state and job category
df_state_counts = df.groupby(['STATE_NAME', 'Job_Category']).size().reset_index(name='Job_Count')
# Step 4: Pivot the data to get counts for Analytics and Non-Analytics Jobs
df_pivot = df_state_counts.pivot(index='STATE_NAME', columns='Job_Category', values='Job_Count').fillna(0)
df_pivot['Total_Jobs'] = df_pivot.get('Analytics Job', 0) + df_pivot.get('Non-Analytics Job', 0)
df_pivot = df_pivot.reset_index()
# Debug: Check the aggregated data
print("Aggregated data by state:")
print(df_pivot)
# Step 5: Find states with minimum and maximum jobs
min_jobs_row = df_pivot.loc[df_pivot['Total_Jobs'].idxmin()]
max_jobs_row = df_pivot.loc[df_pivot['Total_Jobs'].idxmax()]
min_jobs_state = min_jobs_row['STATE_NAME']
max_jobs_state = max_jobs_row['STATE_NAME']
min_jobs_count = min_jobs_row['Total_Jobs']
max_jobs_count = max_jobs_row['Total_Jobs']
# Step 6: Map state names to state codes for Plotly choropleth
state_name_to_code = {
'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}
df_pivot['State_Code'] = df_pivot['STATE_NAME'].map(state_name_to_code)
# Step 7: Clean the data (remove rows with unmapped states)
df_pivot = df_pivot.dropna(subset=['State_Code'])
# Debug: Check the final data before plotting
print("Final data for plotting:")
print(df_pivot[['STATE_NAME', 'State_Code', 'Total_Jobs']])
# Step 8: Define the color scale range and ticks
min_range = 100
max_range = 8100
increment = 900 # To get 10 divisions between 100 and 8100
tickvals = list(range(min_range, max_range + increment, increment))
tickvals = tickvals[:10] # Ensure 8-10 divisions
ticktext = [str(val) for val in tickvals]
# Debug: Print the tick values for the color bar
print("Color bar tick values:", tickvals)
# Step 9: Create the choropleth map with a custom linear color scale
fig = px.choropleth(
df_pivot,
locations='State_Code',
locationmode='USA-states',
color='Total_Jobs',
color_continuous_scale='Reds', # Use a red gradient for impact
scope='usa',
range_color=[min_range, max_range], # Explicitly set the range
title='Geographic Distribution of Analytics Job Postings (2025)',
hover_data=['STATE_NAME', 'Analytics Job', 'Non-Analytics Job', 'Total_Jobs'],
labels={'Total_Jobs': 'Number of Jobs'}
)
# Step 10: Update layout for a stunning visualization
fig.update_layout(
width=1000,
height=700,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
text='Geographic Distribution of Analytics Job Postings (2025)',
font=dict(size=28, color='#FF6B6B', family='Inter, sans-serif'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
geo=dict(
bgcolor='white',
lakecolor='white',
landcolor='lightgray',
subunitcolor='black',
showlakes=True,
showsubunits=True,
showframe=True,
framecolor='#2D3748',
framewidth=2
),
coloraxis_colorbar=dict(
title='Number of Jobs',
title_font=dict(size=16, family='Inter, sans-serif', color='#FF6B6B'),
tickfont=dict(size=12, family='Inter, sans-serif', color='#2D3748'),
tickvals=tickvals,
ticktext=ticktext,
len=0.8,
thickness=20,
outlinecolor='#2D3748',
outlinewidth=1,
bgcolor='rgba(255,255,255,0.8)'
),
margin=dict(l=50, r=50, t=100, b=50)
)
# Step 11: Add annotations for Wyoming and Texas
fig.add_annotation(
x=0.05,
y=0.05,
xref="paper",
yref="paper",
text=f"Lowest: Wyoming ({min_jobs_count} jobs)",
showarrow=False,
font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
bgcolor='rgba(255,255,255,0.8)',
bordercolor='#FF6B6B',
borderwidth=1
)
fig.add_annotation(
x=0.95,
y=0.05,
xref="paper",
yref="paper",
text=f"Highest: Texas ({max_jobs_count} jobs)",
showarrow=False,
font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
bgcolor='rgba(255,255,255,0.8)',
bordercolor='#FF6B6B',
borderwidth=1,
xanchor='right'
)
# Step 12: Save to HTML
fig.write_html(
'figures/edaplot4.html',
include_plotlyjs='cdn',
full_html=False
)Number of rows after cleaning: 71254
Unique states extracted: ['Arkansas' 'Maine' 'Texas' 'Arizona' 'California' 'Ohio' 'New Jersey'
'New York' 'Hawaii' 'Georgia' 'Michigan' 'Mississippi' 'Massachusetts'
'Alaska' 'Alabama' 'Indiana' 'Virginia' 'South Carolina' 'Colorado'
'Nevada' 'Minnesota' 'Oregon' 'Oklahoma' 'North Carolina' 'Florida'
'Washington' 'Delaware' 'Illinois' 'Pennsylvania' 'Kansas' 'Tennessee'
'Washington, D.C. (District of Columbia)' 'Maryland' 'Idaho' 'Louisiana'
'Connecticut' 'Nebraska' 'Missouri' 'North Dakota' 'Utah' 'New Hampshire'
'Wisconsin' 'Kentucky' 'Rhode Island' 'Iowa' 'South Dakota' 'Montana'
'New Mexico' 'Wyoming' 'West Virginia' 'Vermont']
Aggregated data by state:
Job_Category STATE_NAME Analytics Job \
0 Alabama 376
1 Alaska 133
2 Arizona 673
3 Arkansas 252
4 California 3239
5 Colorado 646
6 Connecticut 390
7 Delaware 219
8 Florida 1526
9 Georgia 1144
10 Hawaii 140
11 Idaho 214
12 Illinois 1595
13 Indiana 442
14 Iowa 285
15 Kansas 330
16 Kentucky 279
17 Louisiana 230
18 Maine 193
19 Maryland 791
20 Massachusetts 987
21 Michigan 709
22 Minnesota 709
23 Mississippi 267
24 Missouri 587
25 Montana 105
26 Nebraska 221
27 Nevada 237
28 New Hampshire 134
29 New Jersey 1022
30 New Mexico 163
31 New York 1752
32 North Carolina 1284
33 North Dakota 96
34 Ohio 1149
35 Oklahoma 280
36 Oregon 468
37 Pennsylvania 962
38 Rhode Island 253
39 South Carolina 333
40 South Dakota 149
41 Tennessee 599
42 Texas 2967
43 Utah 350
44 Vermont 118
45 Virginia 1906
46 Washington 856
47 Washington, D.C. (District of Columbia) 668
48 West Virginia 76
49 Wisconsin 477
50 Wyoming 61
Job_Category Non-Analytics Job Total_Jobs
0 282 658
1 101 234
2 932 1605
3 299 551
4 3813 7052
5 786 1432
6 447 837
7 214 433
8 2078 3604
9 1481 2625
10 97 237
11 232 446
12 1912 3507
13 487 929
14 307 592
15 382 712
16 325 604
17 198 428
18 149 342
19 548 1339
20 1027 2014
21 1097 1806
22 740 1449
23 172 439
24 611 1198
25 78 183
26 291 512
27 319 556
28 133 267
29 1569 2591
30 91 254
31 1579 3331
32 1426 2710
33 50 146
34 1442 2591
35 298 578
36 593 1061
37 1284 2246
38 176 429
39 288 621
40 139 288
41 643 1242
42 5083 8050
43 259 609
44 108 226
45 1685 3591
46 744 1600
47 551 1219
48 80 156
49 544 1021
50 42 103
Final data for plotting:
Job_Category STATE_NAME State_Code Total_Jobs
0 Alabama AL 658
1 Alaska AK 234
2 Arizona AZ 1605
3 Arkansas AR 551
4 California CA 7052
5 Colorado CO 1432
6 Connecticut CT 837
7 Delaware DE 433
8 Florida FL 3604
9 Georgia GA 2625
10 Hawaii HI 237
11 Idaho ID 446
12 Illinois IL 3507
13 Indiana IN 929
14 Iowa IA 592
15 Kansas KS 712
16 Kentucky KY 604
17 Louisiana LA 428
18 Maine ME 342
19 Maryland MD 1339
20 Massachusetts MA 2014
21 Michigan MI 1806
22 Minnesota MN 1449
23 Mississippi MS 439
24 Missouri MO 1198
25 Montana MT 183
26 Nebraska NE 512
27 Nevada NV 556
28 New Hampshire NH 267
29 New Jersey NJ 2591
30 New Mexico NM 254
31 New York NY 3331
32 North Carolina NC 2710
33 North Dakota ND 146
34 Ohio OH 2591
35 Oklahoma OK 578
36 Oregon OR 1061
37 Pennsylvania PA 2246
38 Rhode Island RI 429
39 South Carolina SC 621
40 South Dakota SD 288
41 Tennessee TN 1242
42 Texas TX 8050
43 Utah UT 609
44 Vermont VT 226
45 Virginia VA 3591
46 Washington WA 1600
48 West Virginia WV 156
49 Wisconsin WI 1021
50 Wyoming WY 103
Color bar tick values: [100, 1000, 1900, 2800, 3700, 4600, 5500, 6400, 7300, 8200]
import plotly.graph_objects as go
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Clean the data (remove rows with missing values for key columns)
df = df.dropna(subset=['MIN_EDULEVELS_NAME', 'NAICS2_NAME'])
# Step 2: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 3: Shorten industry names for better display
short_names = {
'Professional, Scientific, and Technical Services': 'Tech. Services',
'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
'Health Care and Social Assistance': 'Healthcare',
'Finance and Insurance': 'Finance',
'Information': 'Info Tech',
'Educational Services': 'Education',
'Manufacturing': 'Manufacturing',
'Retail Trade': 'Retail',
'Accommodation and Food Services': 'Hospitality',
'Other Services (except Public Administration)': 'Other Services'
}
df['Industry'] = df['NAICS2_NAME'].map(short_names).fillna(df['NAICS2_NAME'])
# Step 4: Get the top 5 industries by total job count to keep the diagram manageable
total_jobs_by_industry = df.groupby('Industry').size().nlargest(5)
top_industries = total_jobs_by_industry.index.tolist()
df = df[df['Industry'].isin(top_industries)]
# Step 5: Aggregate data to get flows
# Flow from Education Level to Job Category
edu_to_job = df.groupby(['MIN_EDULEVELS_NAME', 'Job_Category']).size().reset_index(name='Count')
# Flow from Job Category to Industry
job_to_industry = df.groupby(['Job_Category', 'Industry']).size().reset_index(name='Count')
# Debug: Check the aggregated data
print("Flow from Education Level to Job Category:")
print(edu_to_job)
print("Flow from Job Category to Industry:")
print(job_to_industry)
# Step 6: Create nodes and links for the Sankey diagram
# Nodes: Education Levels + Job Categories + Industries
edu_levels = list(df['MIN_EDULEVELS_NAME'].unique())
job_categories = list(df['Job_Category'].unique())
industries = list(df['Industry'].unique())
# Create a list of all nodes
all_nodes = edu_levels + job_categories + industries
node_indices = {node: idx for idx, node in enumerate(all_nodes)}
# Links: Education Level -> Job Category
links_source = []
links_target = []
links_value = []
for _, row in edu_to_job.iterrows():
source = node_indices[row['MIN_EDULEVELS_NAME']]
target = node_indices[row['Job_Category']]
value = row['Count']
links_source.append(source)
links_target.append(target)
links_value.append(value)
# Links: Job Category -> Industry
for _, row in job_to_industry.iterrows():
source = node_indices[row['Job_Category']]
target = node_indices[row['Industry']]
value = row['Count']
links_source.append(source)
links_target.append(target)
links_value.append(value)
# Satep 7: Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=15,
thickness=20,
line=dict(color='#2D3748', width=0.5),
label=all_nodes,
color=['#FF6B6B' if 'Job' in node else '#FFE5E5' for node in all_nodes] # Red for job categories, light red for others
),
link=dict(
source=links_source,
target=links_target,
value=links_value,
color=['#FF6B6B' if all_nodes[source] in job_categories else '#FFE5E5' for source in links_source] # Red links for job category flows
)
)])
# Step 8: Update layout for a stunning visualization
fig.update_layout(
title=dict(
text='Flow of Jobs: Education Level → Job Category → Industry (2025)',
font=dict(size=28, color='#FF6B6B', family='Inter, sans-serif'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
width=1000,
height=600,
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
plot_bgcolor='white',
paper_bgcolor='white',
margin=dict(l=50, r=50, t=100, b=50)
)
# Step 9: Add annotations for context
fig.add_annotation(
x=0.05,
y=0.05,
xref="paper",
yref="paper",
text="Flow represents job counts across categories",
showarrow=False,
font=dict(size=12, color='#FF6B6B', family='Inter, sans-serif'),
bgcolor='rgba(255,255,255,0.8)',
bordercolor='#FF6B6B',
borderwidth=1
)
# Step 10: Save to HTML
fig.write_html(
'figures/edaplot5.html',
include_plotlyjs='cdn',
full_html=False
)Flow from Education Level to Job Category:
MIN_EDULEVELS_NAME Job_Category Count
0 Associate degree Analytics Job 495
1 Associate degree Non-Analytics Job 1618
2 Bachelor's degree Analytics Job 14718
3 Bachelor's degree Non-Analytics Job 14576
4 High school or GED Analytics Job 919
5 High school or GED Non-Analytics Job 1510
6 Master's degree Analytics Job 564
7 Master's degree Non-Analytics Job 572
8 No Education Listed Analytics Job 5955
9 No Education Listed Non-Analytics Job 11856
10 Ph.D. or professional degree Analytics Job 22
11 Ph.D. or professional degree Non-Analytics Job 27
Flow from Job Category to Industry:
Job_Category Industry Count
0 Analytics Job Admin & Waste Mgmt 4105
1 Analytics Job Finance 4246
2 Analytics Job Manufacturing 2554
3 Analytics Job Tech. Services 7620
4 Analytics Job Unclassified Industry 4148
5 Non-Analytics Job Admin & Waste Mgmt 4296
6 Non-Analytics Job Finance 2886
7 Non-Analytics Job Manufacturing 2169
8 Non-Analytics Job Tech. Services 15552
9 Non-Analytics Job Unclassified Industry 5256